library(tidyverse)
library(magrittr)
library(lubridate)
library(scales)
library(matrixStats)
library(ggrepel)
library(broom)
library(glue)
library(jsonlite)
library(rvest)
library(RCurl)
library(pander)
library(plotly)
panderOptions("big.mark", ",")
panderOptions("table.split.table", Inf)
panderOptions("table.style", "rmarkdown")
theme_set(theme_bw())
Disclaimer: This very simple report was prepared by a bioinformatician with no experience in epidemiology or virology, and as such should be treated simply as an alternate viewpoint on the data, which I was simply unable to find elsewhere. Many other people exist with much greater expertise on this subject. However, I do hope this provides a useful perspective which is able to add constructively to the wider discussion. In addition, it should be noted that this is very much focussed on Australian data.
confirmed <- url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv") %>%
read_csv() %>%
pivot_longer(
cols = ends_with("20"),
names_to = "date",
values_to = "confirmed"
) %>%
mutate(
date = str_replace_all(
date, "(.+)/(.+)/(.+)", "20\\3-\\1-\\2"
) %>%
ymd()
) %>%
dplyr::rename(
Country = `Country/Region`
) %>%
dplyr::mutate(
Country = case_when(
`Province/State` == "Hubei" ~ "China (Hubei)",
`Province/State` == "Hong Kong" ~ "Hong Kong",
grepl("China", Country) & !`Province/State` %in% c("Hubei", "Hong Kong") ~ "China (Other)",
Country == "Korea, South" ~ "South Korea",
!grepl("China", Country) ~ Country
)
) %>%
dplyr::filter(
!is.na(confirmed)
)
govUrl <- "https://www.health.gov.au/news/health-alerts/novel-coronavirus-2019-ncov-health-alert/coronavirus-covid-19-current-situation-and-case-numbers"
govLines <- govUrl %>%
read_html() %>%
html_nodes("body") %>%
xml_find_all("//div[contains(@class, 'health-table')]") %>%
html_text()
govTable <- c(
"Queensland",
"Australian Capital Territory",
"New South Wales",
"Victoria",
"Tasmania",
"South Australia",
"Western Australia",
"Northern Territory"
) %>%
vapply(function(x){
str_replace_all(govLines, "(\\\n)+", "_") %>%
str_extract(paste(x, "[0-9,]+", sep = "_")) %>%
str_remove(",") %>%
str_remove(".+_")
},
character(1)) %>%
enframe(
name = "Province/State", value = "confirmed"
) %>%
mutate(confirmed = as.integer(confirmed)) %>%
dplyr::filter(!is.na(confirmed)) %>%
mutate(
date = ymd(Sys.Date()),
Country = "Australia"
)
prevDateAU <- confirmed %>%
dplyr::filter(Country == "Australia") %>%
summarise_at(vars(date), max) %>%
.[["date"]]
latestAU <- fromJSON("https://api.infotorch.org/api/covid19/totals/") %>%
as_tibble() %>%
rename(
`Province/State` = state_long,
date = date_updated
) %>%
mutate(
date = ymd(date),
Country = "Australia",
`Province/State` = str_replace_all(
`Province/State`, "ACT", "Australian Capital Territory"
)
) %>%
dplyr::filter(
!str_detect(`Province/State`, "Total")
) %>%
dplyr::select(
`Province/State`, Country, date,
confirmed, deaths, tested
) %>%
bind_rows(govTable) %>%
dplyr::filter(date > prevDateAU) %>%
group_by(`Province/State`) %>%
summarise(
Country = unique(Country),
date = max(date),
confirmed = max(confirmed),
deaths = max(deaths, na.rm = TRUE),
tested = max(tested, na.rm = TRUE)
) %>%
mutate(
tested = ifelse(is.finite(tested), tested, NA),
deaths = ifelse(is.finite(deaths), deaths, NA)
)
# Update the AU records if there are any timepoints
# on days beyond the last value provided by JHU
updateAU <- max(latestAU$date) > prevDateAU
if (updateAU) {
confirmed %<>%
bind_rows(
dplyr::select(latestAU, any_of(colnames(.)))
)
n <- fromJSON("https://api.infotorch.org/api/covid19/totals/") %>%
dplyr::filter(date_updated == max(date_updated), state != "AU") %>%
nrow()
}
recovered <- url("https://raw.githubusercontent.com/ulklc/covid19-timeseries/master/countryReport/raw/rawReport.csv") %>%
read_csv() %>%
rename(
date = day,
Country = countryName
) %>%
mutate(
Country = str_replace_all(Country, "United States", "US")
) %>%
dplyr::select(
Country, Lat = lat, Long = lon, date,
recovered
)
if (updateAU){
recovered %<>%
bind_rows(
latestAU %>%
distinct(Country, date)
) %>%
arrange(Country, date) %>%
mutate(recovered = zoo::na.locf(recovered))
}
deaths <- url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv") %>%
read_csv() %>%
pivot_longer(
cols = ends_with("20"),
names_to = "date",
values_to = "deaths"
) %>%
mutate(
date = str_replace_all(
date, "(.+)/(.+)/(.+)", "20\\3-\\1-\\2"
) %>%
ymd()
) %>%
dplyr::rename(
Country = `Country/Region`
) %>%
dplyr::mutate(
Country = case_when(
`Province/State` == "Hubei" ~ "China (Hubei)",
`Province/State` == "Hong Kong" ~ "Hong Kong",
grepl("China", Country) & !`Province/State` %in% c("Hubei", "Hong Kong") ~ "China (Other)",
Country == "Korea, South" ~ "South Korea",
!grepl("China", Country) ~ Country
)
)
if (updateAU){
deaths %<>%
bind_rows(
dplyr::select(latestAU, any_of(colnames(.)))
) %>%
mutate(deaths = zoo::na.locf(deaths))
}
hb <- 59170000
hk <- 7479307
pops <- tribble(
~Country, ~Population,
"Australia", 25499884,
"Austria", 9006398,
"Belgium", 11575214,
"Brazil", 212129490,
"Canada", 37742154,
"Chile", 19068026,
"China (Hubei)", hb,
"China (Other)", 1408526449 - hb,
"Czechia", 10703010,
"Denmark", 5786274,
"Ecuador", 17564089,
"Finland", 5538181,
"France", 65273511,
"Germany", 83783942,
"Greece", 10423054,
"Hong Kong", hk,
"Iceland", 364260,
"India", 1376270475,
"Indonesia", 272702678,
"Italy", 60486925,
"Iran", 83677594,
"Ireland", 4921810,
"Israel", 8615601,
"Japan", 126476461,
"Jordan", 10174146,
"Luxembourg", 613894,
"Malaysia", 32245488,
"Netherlands", 17134872,
"New Zealand", 4811065,
"Norway", 5408930,
"Pakistan", 219629013,
"Poland", 37858096,
"Portugal", 10196709,
"Qatar", 2866531,
"Romania", 19271798,
"Russia", 145917069,
"Saudi Arabia", 34661353 ,
"Singapore", 5836728,
"Slovenia", 2078860,
"South Korea", 51269185,
"Spain", 46754778,
"Sweden", 10081035,
"Switzerland", 8654622,
"Taiwan*", 23804524,
"Thailand", 69752040,
"Turkey", 84081383,
"United Arab Emirates", 9856053,
"United Kingdom", 67886011,
"US", 331002651
)
Data for confirmed cases and fatalities was primarily sourced from Johns Hopkins University (https://coronavirus.jhu.edu/), using the datasets provided at https://github.com/CSSEGISandData/COVID-19. JHU data is updated every 24 hours at approximately 23:59(UTC), which is about 10:30AM in Adelaide.
Data for recoveries was sourced from https://github.com/ulklc/covid19-timeseries as JHU no longer support this statistic, given concerns regarding reliable figures. This data is not provided by State/Province and any information using these figures cannot be broken down by region, which is of particular note for China and Hubei Province.
Live hourly updates for Australia are available from https://covid-19-au.github.io/ for those who would like an up to the minute breakdown of confirmed cases. Numbers used for generation of this page are updated periodically throughout the day using values provided by https://api.infotorch.org/api/covid19/totals/ and those at www.health.gov.au. The Australian numbers used below do contain confirmed cases reported since the JHU dataset was updated. As of 21:19, 8 of 7 states/territories have released updated COVID-19 figures since the overnight JHU release. The current Australia total is 2,801 using these data sources, and any corrections to these will be made as soon as is possible.
Population sizes for the most impacted countries were manually obtained from https://www.worldometers.info/ and should not be considered as authoritative. Given the disparity of infection within China, China was broken into Hubei Province, Hong Kong and the rest of China, with the exception of recovered cases, for reasons given above. As an open acknowledgement of the crudeness of population values, population estimates for Hubei Province were taken from the 2018 estimates given by Statista.com. This particular population estimate is likely to be low, and as a result confirmed case rates for Hubei province may be an overestimate, and consequently, confirmed case rates for the rest of China may also be an underestimate.
Confirmed cases of COVID-19 as provided by the Chinese Government have been discussed elsewhere as unusual, and data appears potentially unreliable. In this analysis, discussions regarding accurate Chinese reporting are not considered further and data is simply presented as supplied by JHU.
However, all countries are likely to contain many unreported cases given the incomplete testing regimes in place for most countries. Similarly, reporting in many countries may have features that cause concerns regarding data integrity and this makes comparison across countries difficult.
startingPoint <- 4
minPop <- 4e6
For this section, all data is presented relative to population size. Data is only shown for countries with a population larger than 4,000,000. In addition, growth in infection rates is only shown after the point at which the cumulative confirmed infection rate breached 4 confirmed cases / million. This equates to about 102 confirmed cases within Australia, and is broadly comparable to the “Days since passing 100 confirmed cases” commonly shown elsewhere.
Confirmed cases in this table are effectively the cumulative, confirmed incidence rate. Recovered patients and those who have passed away are still included in these numbers.
confirmed %>%
group_by(Country, date) %>%
summarise(confirmed = sum(confirmed)) %>%
ungroup() %>%
group_by(Country) %>%
dplyr::filter(
date == max(date),
) %>%
ungroup() %>%
right_join(pops) %>%
dplyr::filter(
Population > minPop
) %>%
mutate(
rate = 1e6*confirmed / Population
) %>%
arrange(desc(rate)) %>%
dplyr::slice(1:25) %>%
mutate(
rate = sprintf("%.1f", rate),
Population = comma(Population)
) %>%
rename_at(vars(everything()), str_to_title) %>%
dplyr::select(
Country, Date, Confirmed, Population, Rate
) %>%
rename(`Rate (Cases per Million)` = Rate) %>%
pander(
justify = "lrrrr",
caption = paste(
"The", nrow(.), "most impacted countries studied here and shown as a proportion of total population.",
"Only countries with a population greater than",
comma(minPop), "are shown.",
"The final column provides the latest confirmed infection rate as cases per million people.",
"Whilst the virus spreads with no regard to population size, the rate as shown here indicates the __degree of stress which each country's health-care system is likely to be experiencing__.",
"Several countries shown here have not attracted much media attention due lower case numbers than China and Italy, but are likely to be experiencing significant duress."
)
)
| Country | Date | Confirmed | Population | Rate (Cases per Million) |
|---|---|---|---|---|
| Switzerland | 2020-03-25 | 10,897 | 8,654,622 | 1259.1 |
| Italy | 2020-03-25 | 74,386 | 60,486,925 | 1229.8 |
| China (Hubei) | 2020-03-25 | 67,801 | 59,170,000 | 1145.9 |
| Spain | 2020-03-25 | 49,515 | 46,754,778 | 1059.0 |
| Austria | 2020-03-25 | 5,588 | 9,006,398 | 620.4 |
| Norway | 2020-03-25 | 3,084 | 5,408,930 | 570.2 |
| Germany | 2020-03-25 | 37,323 | 83,783,942 | 445.5 |
| Belgium | 2020-03-25 | 4,937 | 11,575,214 | 426.5 |
| France | 2020-03-25 | 25,600 | 65,273,511 | 392.2 |
| Netherlands | 2020-03-25 | 6,438 | 17,134,872 | 375.7 |
| Iran | 2020-03-25 | 27,017 | 83,677,594 | 322.9 |
| Denmark | 2020-03-25 | 1,862 | 5,786,274 | 321.8 |
| Ireland | 2020-03-25 | 1,564 | 4,921,810 | 317.8 |
| Portugal | 2020-03-25 | 2,995 | 10,196,709 | 293.7 |
| Israel | 2020-03-25 | 2,369 | 8,615,601 | 275.0 |
| Sweden | 2020-03-25 | 2,526 | 10,081,035 | 250.6 |
| US | 2020-03-25 | 65,778 | 331,002,651 | 198.7 |
| South Korea | 2020-03-25 | 9,137 | 51,269,185 | 178.2 |
| Finland | 2020-03-25 | 880 | 5,538,181 | 158.9 |
| Czechia | 2020-03-25 | 1,654 | 10,703,010 | 154.5 |
| United Kingdom | 2020-03-25 | 9,640 | 67,886,011 | 142.0 |
| Australia | 2020-03-26 | 2,801 | 25,499,884 | 109.8 |
| Singapore | 2020-03-25 | 631 | 5,836,728 | 108.1 |
| Canada | 2020-03-25 | 3,251 | 37,742,154 | 86.1 |
| Greece | 2020-03-25 | 821 | 10,423,054 | 78.8 |
minDays <- 5
# Use Singapore as that has the longest dataset besides Hubei
nDays <- confirmed %>%
dplyr::filter(Country == "Singapore") %>%
group_by(Country, date) %>%
summarise(confirmed = sum(confirmed)) %>%
ungroup() %>%
left_join(pops) %>%
mutate(
rate = 1e6*confirmed / Population
) %>%
dplyr::filter(rate > startingPoint) %>%
nrow() %>%
subtract(1)
refRate <- c(2, 4, 8)
p <- confirmed %>%
group_by(Country, date) %>%
summarise(confirmed = sum(confirmed)) %>%
ungroup() %>%
right_join(
dplyr::filter(pops, Population > minPop)
) %>%
mutate(
rate = 1e6*confirmed / Population
) %>%
dplyr::filter(
rate > startingPoint
) %>%
group_by(Country) %>%
mutate(
days = date - min(date)
) %>%
dplyr::filter(
max(days) >= minDays
) %>%
ungroup() %>%
mutate(
days = as.integer(days),
rate = round(rate, 2)
) %>%
dplyr::filter(days <= nDays) %>%
arrange(date) %>%
mutate(Country = fct_inorder(Country)) %>%
rename_all(str_to_title) %>%
ggplot(
aes(Days, Rate, colour = Country, Date = Date, Confirmed = Confirmed)
) +
geom_segment(
aes(x, y, xend = xmax, yend = ymax),
data = tibble(
x = 0,
y = startingPoint,
xmax = c(16, 32, nDays ),
ymax = startingPoint*2^(xmax / refRate)
),
inherit.aes = FALSE,
colour = "grey70",
linetype = 2
) +
geom_text(
aes(xmax, ymax, label = label),
data = tibble(
xmax = c(16, 32, nDays - 2),
ymax = startingPoint*2^(xmax / refRate),
label = glue("Doubling in\n {refRate} days")
),
colour = "grey70",
inherit.aes = FALSE
) +
geom_vline(
aes(xintercept = Days),
data = . %>%
dplyr::filter(Country == "Australia") %>%
dplyr::filter(Days == max(Days)),
linetype = 3,
colour = "blue",
size = 1/3
) +
geom_hline(
aes(yintercept = Rate),
data = . %>%
dplyr::filter(Country == "Australia") %>%
dplyr::filter(Days == max(Days)),
linetype = 3,
colour = "blue",
size = 1/3
) +
geom_point() +
geom_line() +
scale_x_continuous(
expand = expand_scale(mult = c(0, 0.05)),
) +
scale_y_log10(
expand = expand_scale(mult = c(0, 0.05))
) +
xlab(
paste(
"Days since passing",
startingPoint,
"confirmed cases/million"
)
) +
ylab("Confirmed Cumulative Infection Rate (cases/million)")
ggplotly(
p,
tooltip = c(
"Days", "Rate", "Country", "Date", "Confirmed"
))
COVID-19 Confirmed Cumulative Infection Rate for countries which have exceeded 4 confirmed cases/million for 5 or more days, and with populations greater than 4,000,000. Data is only shown for the first 50 calendar days since passing 4 confirmed cases/million. Note that from the day records begin in this dataset (2020-01-22), the confirmed infection rate in Hubei was 7.5 confirmed cases/million. The blue dashed line indicates Australia’s current position on this figure. Diagonal grey lines indicate a doubling in the infection rate every 2, 4, or 8 days. To hide a country, click on the country in the plot legend. Clicking again on the country in the legend will restore the data within the plot. Countries are shown in order of the date at which they passed the 4 confirmed case/million mark. Due to the large number of countries shown, you may need to scroll through the legend. Regions of the plot are also able to be zoomed interactively. Please note the y-axis is shown on the logarithmic scale, so that a series of points which appear to be diagonal will indicate exponential growth. The flatter the line, the slower the growth and a perfectly horizontal line would indicate zero growth, or no new confirmed cases.
All figures and tables presented here simply aim to show an alternative viewpoint on the data. Every way to view COVID-19 data will mask important features, and the values shown here do not take into account vital factors such as population density, variability of infection across regions within countries, social culture and demographics. Many countries may not be directly comparable for a combination of the above factors. It is simply to view the data through the lens of a country’s population size using a value which should be easily interpretable.
In the above plot:
As an alternative viewpoint, the numbers of recovered and deceased patients have been removed from the above plot to provide an estimate of the currently active infections. Given the changes made to JHU data for recovered cases (made on 23-03-2020), data for China is no longer able to be separated into Hubei Province and the remainder of China, and is presented as one set of data for this plot.
p2 <- confirmed %>%
left_join(deaths) %>%
mutate(
Country = str_replace_all(Country, "China.+", "China")
) %>%
right_join(
pops %>%
mutate(
Country = str_replace_all(Country, "China.+", "China")
) %>%
group_by(Country) %>%
summarise_at(vars(Population), sum) %>%
dplyr::filter(Population > minPop)
) %>%
group_by(Country, date, Population) %>%
summarise_at(vars(confirmed, deaths), sum) %>%
left_join(recovered ) %>%
mutate(
active = confirmed - recovered - deaths
) %>%
dplyr::filter(
!is.na(active), confirmed > 0
) %>%
mutate(rate = 1e6 * active / Population) %>%
dplyr::filter(
rate > startingPoint
) %>%
group_by(Country) %>%
mutate(
days = date - min(date)
) %>%
dplyr::filter(max(days) > minDays) %>%
ungroup() %>%
mutate(
days = as.integer(days),
rate = round(rate, 2)
) %>%
dplyr::filter(days <= nDays) %>%
arrange(date) %>%
mutate(Country = fct_inorder(Country)) %>%
rename_all(str_to_title) %>%
ggplot(
aes(
x = Days, y = Rate, colour = Country,
Date = Date, Active = Active,
Confirmed = Confirmed, Recovered = Recovered,
Deaths = Deaths
)
) +
geom_segment(
aes(x, y, xend = xmax, yend = ymax),
data = tibble(
x = 0,
y = startingPoint,
xmax = c(20.5, 41, nDays),
ymax = startingPoint + 2^(xmax/ c(2, 4, 8))
),
inherit.aes = FALSE,
colour = "grey70",
linetype = 2
) +
geom_text(
aes(xmax, ymax, label = label),
data = tibble(
xmax = c(20.5, 41, nDays),
rate = c(2, 4, 8),
ymax = 2^(xmax/ rate),
label = glue("Double in\n {rate} days")
),
colour = "grey70",
inherit.aes = FALSE
) +
geom_vline(
aes(xintercept = Days),
data = . %>%
dplyr::filter(Country == "Australia") %>%
dplyr::filter(Days == max(Days)),
linetype = 3,
colour = "blue",
size = 1/3
) +
geom_hline(
aes(yintercept = Rate),
data = . %>%
dplyr::filter(Country == "Australia") %>%
dplyr::filter(Days == max(Days)),
linetype = 3,
colour = "blue",
size = 1/3
) +
geom_point() +
geom_line() +
scale_x_continuous(
expand = expand_scale(mult = c(0, 0.05)),
) +
scale_y_log10(
expand = expand_scale(mult = c(0, 0.05))
) +
xlab(
paste(
"Days since passing",
startingPoint,
"confirmed active cases/million"
)
) +
ylab("Confirmed Active Infection Rate (cases/million)")
ggplotly(p2)
Confirmed active cases of COVID-19 for countries where the confirmed infection rate has exceeded 4 confirmed active cases/million for more than 50 calendar days. Only countries with a population greater than 4,000,000 are shown for better visualisation. Due to difficulties introduced by the currently reported low active infection rate outside Hubei province, data from China has been excluded from this plot, with the exception of Hubei and Hong Kong. The blue dashed lines indicates Australia’s current position on this figure. To hide a country, click on the country in the plot legend. Clicking again on the country in the legend will restore the data within the plot. Countries are shown in order of the date at which they passed the 4 confirmed active case/million mark. Due to the number of countries shown, you may need to scroll through the legend. Regions of the plot are also able to be zoomed interactively. Please note the y-axis is shown on the logarithmic scale, so that a series of points which appear to be diagonal will indicate exponential growth/decay. Given the different starting point to the previous plot, data will generally be shown for fewer time-points.
Notable features of this perspective are:
In order to summarise which countries are the most similar to each other, a Principal Component Analysis was performed. This enables the multi-dimensional data of the above plots to summarised in two dimensions. As missing data cannot be included in this analysis, several countries which are at earlier comparative time-points than Australia were omitted from this analysis.
nDays <- confirmed %>%
dplyr::filter(Country == "Australia") %>%
group_by(Country, date) %>%
summarise(confirmed = sum(confirmed)) %>%
ungroup() %>%
left_join(pops) %>%
mutate(
rate = 1e6*confirmed / Population
) %>%
dplyr::filter(rate > startingPoint) %>%
nrow() %>%
subtract(1)
pca <- confirmed %>%
group_by(Country, date) %>%
summarise(confirmed = sum(confirmed)) %>%
ungroup() %>%
right_join(pops) %>%
mutate(
rate = 1e6*confirmed / Population
) %>%
dplyr::filter(
rate > startingPoint,
Population > minPop
) %>%
group_by(Country) %>%
mutate(
days = date - min(date)
) %>%
dplyr::filter(
max(days) >= nDays
) %>%
ungroup() %>%
mutate(
days = as.integer(days),
rate = round(rate, 2)
) %>%
dplyr::filter(days <= nDays) %>%
dplyr::select(Country, rate, days) %>%
pivot_wider(
values_from = rate,
names_from = days
) %>%
as.data.frame() %>%
column_to_rownames("Country") %>%
as.matrix() %>%
.[!rowAnyNAs(.),] %>%
log() %>%
prcomp()
set.seed(101)
pca$x %>%
as.data.frame() %>%
rownames_to_column("Country") %>%
mutate(
Cluster = cbind(PC1, PC2) %>%
kmeans(centers = k, nstart = 10) %>%
.[["cluster"]] %>%
as.factor()
) %>%
ggplot(aes(PC1, PC2, label = Country, colour = Cluster)) +
geom_point() +
geom_text_repel(
show.legend = FALSE
) +
stat_ellipse(
aes(fill = Cluster),
colour = NA,
geom = "polygon",
alpha = 0.1
) +
xlab(
paste0(
"PC1 (", pcPercent[['PC1']],")"
)
) +
ylab(
paste0(
"PC2 (", pcPercent[['PC2']], ")"
)
) +
theme(
legend.position = "none"
)
Dimensional reduction showing which countries infection rates are the most similar to each other at the 16 day time point, after passing 4 confirmed cases/million. This may or may not be indicative of future spread within the population. The value 16 days was simply chosen as this marks how long since Australia passed this threshold. Countries which have not progressed beyond 4 confirmed cases/million for 16 days or more are not shown. Countries with populations < 4,000,000 are also excluded. Clusters were generated using k-means, manually specifying k = 4 and are not definitive. Principal Component 1, on the x-axis, corresponds to the greatest source of variability within the data (93.7%), and countries which appear near each other along this axis can be assumed to be showing similar growth in confirmed infection rates at this time point. Separation along the y-axis is less significant, but also worthy of note, as this represents 3.7% of variability within the data. At this early point, Australia’s cumulative, confirmed infection rate is diverging from the cluster of countries which have responded well and is becoming more similar to Israel, the UK and other poor responding countries. This is suggestive that the early measures instituted in Australia may have been inadequate.
All rates presented in this section do not take population size into account, but simply look at the rates of recovery and fatalities within each country’s cohort.
fr <- confirmed %>%
inner_join(deaths) %>%
group_by(Country) %>%
dplyr::filter(date == max(date)) %>%
ungroup() %>%
summarise(fr = sum(deaths) / sum(confirmed)) %>%
.[["fr"]]
rr <- confirmed %>%
mutate(
Country = str_replace_all(
Country, "China.+", "China"
)
) %>%
right_join(
pops %>%
mutate(
Country = str_replace_all(
Country, "China.+", "China"
)
) %>%
group_by(Country) %>%
summarise_at(vars(Population), sum)
) %>%
group_by(Country, Population, date) %>%
summarise_at(vars(confirmed), sum) %>%
left_join(recovered) %>%
dplyr::filter(!is.na(recovered)) %>%
group_by(Country) %>%
dplyr::filter(date == max(date)) %>%
ungroup() %>%
summarise(rr = sum(recovered) / sum(confirmed)) %>%
.[["rr"]]
Summarising all available data from all countries:
p4 <-confirmed %>%
left_join(deaths) %>%
mutate(
Country = str_replace_all(Country, "China.+", "China")
) %>%
group_by(Country, date) %>%
summarise_at(vars(confirmed, deaths), sum) %>%
dplyr::filter(date == max(date)) %>%
right_join(
pops %>%
mutate(
Country = str_replace_all(Country, "China.+", "China")
) %>%
group_by(Country) %>%
summarise_at(vars(Population), sum)
) %>%
dplyr::filter(
1e6*confirmed / Population > startingPoint,
) %>%
ungroup() %>%
left_join(recovered) %>%
mutate(
active = confirmed - recovered - deaths
) %>%
mutate(
active = 100*active / confirmed,
recovered = 100*recovered / confirmed,
fatalities = 100*deaths / confirmed
) %>%
dplyr::filter(active < 100) %>%
arrange(desc(confirmed)) %>%
mutate(Country = fct_inorder(Country)) %>%
pivot_longer(
cols = c(active, recovered, fatalities),
names_to = "Status",
values_to = "Percentage"
) %>%
mutate(
Status = str_to_title(Status),
Status = factor(
Status,
levels = c("Active", "Recovered", "Fatalities")
),
Percentage = round(Percentage, 2)
) %>%
mutate(confirmed = comma(confirmed)) %>%
rename(Confirmed = confirmed) %>%
ggplot(
aes(
Country, Percentage,
fill = Status, cases = Confirmed
)
) +
geom_col() +
scale_fill_manual(
values = c(
Active = "blue",
Recovered = "green",
Fatalities = "red"
)
) +
scale_y_continuous(expand = expand_scale(0, 0)) +
coord_flip() +
labs(x = c()) +
theme(
legend.position = "none"
)
ggplotly(p4)
Fatality, Recovery and Active Infection rates for countries which have exceeded 4 confirmed cases / million. Countries are shown in order of the number of confirmed cases.
minDays <- 9
df <- confirmed %>%
inner_join(deaths) %>%
group_by(Country, date) %>%
summarise_at(
vars(confirmed, deaths),
sum
) %>%
right_join(pops) %>%
ungroup() %>%
mutate(
`Infection Rate` = 1e6 * confirmed / Population
) %>%
dplyr::filter(
`Infection Rate` > startingPoint
) %>%
group_by(Country) %>%
mutate(
Days = date - min(date),
n = n()
) %>%
dplyr::filter(
n > minDays,
max(deaths, na.rm = TRUE) > 0
) %>%
mutate(
Rate = deaths / confirmed,
`Fatality Rate` = percent(Rate, accuracy = 0.1),
minusT = date - max(date)
) %>%
ungroup()
plotFr <- mutate(
df,
Country = factor(
Country,
levels = df %>%
dplyr::select(Country, minusT, Rate) %>%
pivot_wider(
id_cols = Country,
names_from = minusT,
values_from = Rate
) %>%
as.data.frame() %>%
column_to_rownames("Country") %>%
dist() %>%
hclust() %>%
as.dendrogram() %>%
labels()
)
) %>%
rename_all(str_to_title) %>%
ggplot(
aes(
x = Days, y = Country, fill = Rate,
conf = Confirmed,
deaths = Deaths,
date = Date,
label = `Fatality Rate`
)
) +
geom_raster() +
geom_vline(
aes(xintercept = Days + 0.5),
data = . %>%
dplyr::filter(Country == "Australia") %>%
dplyr::filter(Date == max(Date)),
linetype = 3,
size = 1/3,
colour = "grey70"
) +
scale_fill_viridis_c(
option = "magma",
breaks = seq(0, 0.08, by = 0.02)
) +
scale_x_continuous(
expand = expand_scale(0, 0),
labels = abs
) +
scale_y_discrete(expand = expand_scale(0, 0)) +
labs(
x = glue(
"Days since passing {startingPoint} cases/million"
),
y = c(),
fill = "Fatality\nRate"
) +
theme(
panel.grid = element_blank()
)
cpFr <- glue(
"*Fatality Rate for confirmed cases after passing {startingPoint} confirmed cases / million.
Only countries with {minDays} days of data beyond this time-point are shown, and are shown after clustering based on the most recent values.
Countries with no recorded fatalities have been excluded.
The dashed grey line indicates the time-point Australia is currently at.
A clear trend of an increasing fatality rate with time is evident in many countries (e.g. Spain, France, Italy, UK), however, for some countries (e.g. Singapore) this rate appears relatively stable throughout the majority of days.
The overall Fatality Rate for confirmed cases is currently ({percent(fr, accuracy = 0.1)}).*"
)
ggplotly(
plotFr,
tooltip = c(
"Country", "Date", "Days", "Confirmed", "Deaths",
"Fatality Rate"
)
)
Fatality Rate for confirmed cases after passing 4 confirmed cases / million. Only countries with 9 days of data beyond this time-point are shown, and are shown after clustering based on the most recent values. Countries with no recorded fatalities have been excluded. The dashed grey line indicates the time-point Australia is currently at. A clear trend of an increasing fatality rate with time is evident in many countries (e.g. Spain, France, Italy, UK), however, for some countries (e.g. Singapore) this rate appears relatively stable throughout the majority of days. The overall Fatality Rate for confirmed cases is currently (4.5%).
minDays <- 9
df <- confirmed %>%
left_join(deaths) %>%
mutate(
Country = str_replace_all(Country, "China.+", "China")
) %>%
group_by(Country, date) %>%
summarise_at(vars(confirmed, deaths), sum) %>%
left_join(recovered) %>%
group_by(Country, date) %>%
summarise_at(
vars(confirmed, recovered, deaths),
sum
) %>%
ungroup() %>%
right_join(
pops %>%
mutate(
Country = str_replace_all(Country, "China.+", "China")
) %>%
group_by(Country) %>%
summarise_at(vars(Population), sum)
) %>%
mutate(
active = confirmed - deaths - recovered,
`Infection Rate` = 1e6 * confirmed / Population
) %>%
dplyr::filter(`Infection Rate` > startingPoint) %>%
group_by(Country) %>%
mutate(
Days = date - min(date),
ratio = round(log2(recovered / deaths), 2),
) %>%
dplyr::filter(
max(Days) > minDays
) %>%
ungroup() %>%
dplyr::filter(
!is.infinite(ratio),
!is.na(ratio)
)
p6 <- df %>%
mutate(
Country = factor(
Country,
levels = df %>%
dplyr::select(Country, Days, ratio) %>%
group_by(Country) %>%
mutate(minusT = as.integer(Days - max(Days))) %>%
pivot_wider(Country, minusT, values_from = ratio) %>%
as.data.frame() %>%
column_to_rownames("Country") %>%
as.matrix %>%
.[matrixStats::rowCounts(.,value = NA) < ncol(.),] %>%
dist() %>%
hclust() %>%
as.dendrogram() %>%
labels()
)
) %>%
rename_all(str_to_title) %>%
ggplot(
aes(
x = Days, y = Country, fill = Ratio,
conf = Confirmed,
rec = Recovered,
deaths = Deaths,
date = Date,
)
) +
geom_raster() +
scale_fill_gradient2(
low = "red", high = "green", na.value = "grey80"
) +
scale_x_continuous(
expand = expand_scale(0, 0)
) +
scale_y_discrete(
expand = expand_scale(0, 0)
) +
labs(
x = glue(
"Days since passing {startingPoint} cases/million"
),
y = c(),
fill = "Ratio"
) +
theme(
panel.grid = element_blank()
)
cp6 <- glue(
"*Comparison of recoveries and fatalities as a time course, beginning at the day cases exceeded {startingPoint} confirmed cases / million.
Countries are only shown if data is available for more than {minDays} days since passing this threshold.
The ratio of recoveries to fatalities is shown on the log~2~ scale, such that when fatalities outnumber recoveries, the colour become more red.
Conversely if the recoveries outnumber the fatalities the colour will become more green.
If these two numbers are approximately equal, the colour will be white.
As the numbers are on the log~2~ scale, a ratio of 2, would indicate 2^2^ = 4 times the number of __recoveries to fatalities__.
Similarly a ratio of -2 would indicate that fatalities outnumber recoveries by a factor of 2^2^ = 4.
Ratios are only available once there is one or more confirmed recoveries in addition to one or more confirmed fatalities.*"
)
A time-course of the relationship between recoveries and fatalities is shown. Series of days which extend deeply into the red may possibly indicate where medical facilities are overloaded. However, it is possible that during these times more attention is focussed on keeping critically ill patients alive than confirming a recovery. It should be noted that many of the countries which are deep into the red at the current time are affluent Western countries. It is also worth noting that despite the high death toll in Italy, recoveries outnumber fatalities for the majority of this time-course.
ggplotly(p6)
Comparison of recoveries and fatalities as a time course, beginning at the day cases exceeded 4 confirmed cases / million. Countries are only shown if data is available for more than 9 days since passing this threshold. The ratio of recoveries to fatalities is shown on the log2 scale, such that when fatalities outnumber recoveries, the colour become more red. Conversely if the recoveries outnumber the fatalities the colour will become more green. If these two numbers are approximately equal, the colour will be white. As the numbers are on the log2 scale, a ratio of 2, would indicate 22 = 4 times the number of recoveries to fatalities. Similarly a ratio of -2 would indicate that fatalities outnumber recoveries by a factor of 22 = 4. Ratios are only available once there is one or more confirmed recoveries in addition to one or more confirmed fatalities.
minDeaths <- 10
minDays <- 3
deathPlot <- confirmed %>%
left_join(deaths) %>%
group_by(Country, date) %>%
summarise_at(
vars(confirmed, deaths),
sum
) %>%
ungroup() %>%
dplyr::filter(deaths >= minDeaths) %>%
group_by(Country) %>%
mutate(Days = as.integer(date - min(date))) %>%
dplyr::filter(
max(Days) > minDays
) %>%
ungroup() %>%
rename_all(str_to_title) %>%
ggplot(
aes(
x = Days, y = Deaths, colour = Country,
label = Date, conf = Confirmed)
) +
geom_point() +
geom_line() +
scale_y_log10() +
labs(
x = glue("Days Since Passing {minDeaths} Deaths"),
y = "Cumulative Fatalities"
)
ggplotly(deathPlot)
As with the spread of the virus, fatalities also grow at an exponential rate. Any slowing in the growth of fatalities is an accurate marker for when the spread of the virus has definitively slowed, despite being a significantly lagging marker. Cumulative fatalities are only shown for countries which have seen 10 or more fatalities for a period beyond 3 days.
Australian State populations were taken from the ABS Website and were accurate in Sept 2019. The difference with previous estimates used above was within 0.04%, and as such no adjustments were made.
A series of complimentary charts regarding the spread of COVID-19 are available from the ABC website.
Australia’s spread of the virus appears in the previous plots as marginally slower than many other countries, grouping together with countries such as Greece, Israel and the UK.
confirmed %>%
dplyr::filter(
Country == "Australia",
date >= max(date) - 1
) %>%
group_by(`Province/State`) %>%
mutate(
d = diff(confirmed) / min(confirmed)
) %>%
pivot_wider(
id_cols = "Province/State",
names_from = date,
values_from = c(confirmed, d)
) %>%
set_names(
str_remove(names(.), "confirmed_")
) %>%
dplyr::select(seq_len(ncol(.) - 1)) %>%
set_names(
str_replace(names(.), "d_.+", "%Increase")
) %>%
arrange(desc(`%Increase`)) %>%
pander(
justify = "lrrr",
caption = paste(
"*Confirmed cases reported by each state at time of preparation.",
"Any states with unchanged values may indicate issues with the automated data sources, such as health.gov.au or JHU, or that these states have not yet reported for the day.*"
)
)
| Province/State | 2020-03-25 | 2020-03-26 | %Increase |
|---|---|---|---|
| South Australia | 170 | 235 | 0.3824 |
| Australian Capital Territory | 39 | 53 | 0.359 |
| Northern Territory | 6 | 8 | 0.3333 |
| Western Australia | 175 | 231 | 0.32 |
| New South Wales | 1,029 | 1,219 | 0.1846 |
| Tasmania | 36 | 42 | 0.1667 |
| Victoria | 466 | 520 | 0.1159 |
| Queensland | 443 | 493 | 0.1129 |
ausPops <- tribble(
~State, ~Population,
"New South Wales", 8117976,
"Victoria", 6629870,
"Queensland", 5115451,
"South Australia", 1756494,
"Western Australia", 2630557,
"Tasmania", 535500,
"Northern Territory", 245562,
"Australian Capital Territory", 428060
)
minRate <- 4
p5 <- confirmed %>%
dplyr::filter(
Country == "Australia"
) %>%
dplyr::rename(State = `Province/State`) %>%
left_join(ausPops) %>%
mutate(
Rate = round(1e6*confirmed / Population, 2),
Date = format.Date(date, "%d-%B")
) %>%
dplyr::filter(
!is.na(Population),
!str_detect(State, "Northern"),
Rate > minRate
) %>%
arrange(date) %>%
mutate(
State = fct_inorder(State)
) %>%
dplyr::rename(
Confirmed = confirmed
) %>%
ggplot(
aes(
x = date, y = Rate, colour = State,
label = Date, key = Confirmed
)
) +
geom_point() +
geom_smooth(
method = "lm",
se = FALSE,
show.legend = FALSE
) +
geom_line(linetype = 3) +
scale_y_log10() +
labs(
x = "Date",
y = "Confirmed Infection Rate (cases/million)"
)
# Hide the tooltip from the regression lines
n <- length(levels(p5$data$State))
p5 <- ggplotly(p5, tooltip = c("Date", "Rate", "State", "Confirmed"))
regIndex <- seq(n + 1, length.out = n, by = 1)
p5$x$data[regIndex] <- lapply(
p5$x$data[regIndex],
function(x){
x$hoverinfo <- "none"
x
})
p5
Infection rates for each state with data beginning for each state once 4 confirmed cases /million was exceeded. Linear regression lines are shown for each state as solid lines, with NSW perhaps showing a slightly increased rate of infection within the population. Once again, the y-axis is on a logarithmic scale indicating exponential growth is occurring. States are shown in order of the initial date they passed 4 cases/million. Due to the low population size in the NT, these cases were omitted.
In order to test whether the infection rates are different between states, a linear regression model was fit. \(\log_{10}\)(Cumulative Confirmed Infection Rate) was assigned as the response variable with predictor variables being the State and Date. Each State was assigned its own intercept and slope by use of an interaction term (i.e. State:date). Given the potentially larger slope in NSW, this state was set as the baseline, with each other slope (i.e. interaction term) being presented as the difference in slope between each state and NSW. In this way comparisons against NSW were performed, but no comparisons between other states were performed.
Differences in the State-level intercepts are not particularly relevant, apart from capturing the initial time at which cumulative confirmed infection rate exceeded 4 cases / million. Differences between State-level slopes and NSW however, are of great interest, and as such only the slopes are shown. For NSW (Term = date) this captures the actual slope of the daily change in infection rate, whilst for all other States, this represents the difference between the daily change in infection rate for that State in comparison to NSW. Only differences in slope with an Bonferroni-adjusted p-value < 0.05 are of particular interest. For those States which appear to be of interest, a negative value indicates an infection rate increasing more slowly than NSW, whilst a positive value indicates the opposite..
lm <- confirmed %>%
dplyr::filter(
Country == "Australia"
) %>%
dplyr::rename(State = `Province/State`) %>%
left_join(ausPops) %>%
mutate(
Rate = round(1e6*confirmed / Population, 2),
Date = format.Date(date, "%d-%B")
) %>%
dplyr::filter(
!is.na(Population),
!str_detect(State, "Northern"),
Rate > minRate
) %>%
arrange(desc(confirmed)) %>%
mutate(
State = fct_inorder(State)
) %>%
dplyr::rename(
Confirmed = confirmed
) %>%
with(
lm(log10(Rate) ~ (State + date)^2)
)
lm %>%
tidy() %>%
mutate(
adjP = p.adjust(p.value, method = "bonf"),
term = str_remove_all(term, "State")
) %>%
rename(
Term = term,
Estimate = estimate,
SE = std.error,
`T` = statistic,
p = p.value
) %>%
dplyr::filter(
str_detect(Term, ":date")
) %>%
mutate(
Term = str_remove(Term, ":date"),
Estimate = sprintf("%.4f", Estimate),
SE = sprintf("%.3f", SE),
`T` = sprintf("%.2f", `T`),
p = case_when(
p < 1e-4 ~ sprintf("%.2e", p),
p >= 1e-4 ~ sprintf("%.4f", p)
),
adjP = case_when(
adjP < 1e-4 ~ sprintf("%.2e", adjP),
adjP >= 1e-4 ~ sprintf("%.4f", adjP)
)
) %>%
rename(
`p~bonf~` = adjP
) %>%
pander(
justify = "lrrrrr",
emphasize.strong.rows = which(
as.numeric(.$`p~bonf~`) < 0.05 & .$Term != "date"
),
caption = paste(
"*Results of linear regression analysis",
"comparing the slopes of lines which track __daily",
"change in cumulative confirmed infection rates__,",
"scaled for population size in each state.",
"Intercept terms are not shown.",
"All states are shown in comparison to NSW.",
"Any highlighted state indicates an infection rate which is increasing at a different daily rate to NSW.",
"Positive values in the Estimate column indicate a greater rate the NSW, whilst a negative value in the Estimate column indicates a slower growth rate than NSW.",
"Diagnostic plots visually appeared",
"to be within acceptable bounds for real data,",
"the fitted model also checked using the Shapiro-Wilk Test for normality",
paste0(
"(p = ",
sprintf(
"%.4f", shapiro.test(resid(lm))$p.value
),
")*."
)
)
)
| Term | Estimate | SE | T | p | pbonf |
|---|---|---|---|---|---|
| Victoria | 0.0073 | 0.005 | 1.58 | 0.1184 | 1.0000 |
| Queensland | 0.0061 | 0.005 | 1.31 | 0.1935 | 1.0000 |
| South Australia | 0.0091 | 0.004 | 2.25 | 0.0268 | 0.3748 |
| Western Australia | 0.0141 | 0.005 | 3.03 | 0.0032 | 0.0442 |
| Australian Capital Territory | 0.0776 | 0.006 | 12.61 | 1.33e-21 | 1.87e-20 |
| Tasmania | -0.0083 | 0.004 | -2.05 | 0.0430 | 0.6017 |
coef(lm) %>%
enframe(name = "Term", value = "coef") %>%
dplyr::filter(str_detect(Term, "date")) %>%
mutate(
slope = case_when(
str_detect(Term, "State") ~ coef[[1]] + coef,
!str_detect(Term, "State") ~ coef
),
Term = str_remove(Term, "State"),
State = str_remove(Term, ":date"),
State = str_replace(State, "date", "New South Wales")
) %>%
left_join(
confirmed %>%
dplyr::filter(
Country == "Australia",
date == max(date)
) %>%
rename(State = `Province/State`) %>%
dplyr::select(State, date, confirmed)
) %>%
mutate(
predicted = round(10^slope*confirmed, 0),
slope = percent(10^slope, accuracy = 0.1),
) %>%
dplyr::select(
State,
`Expected Daily Increase` = slope,
`Most Recent` = confirmed,
`Predicted For Next Day` = predicted
) %>%
pander(
justify = "lrrr",
caption = paste(
"Results from linear regression shown as the",
"expected daily increase in cases for the",
"current exponential growth phase.",
"Most recent values were taken from the official",
"figures on",
confirmed %>%
dplyr::filter(Country == "Australia") %>%
.[["date"]] %>%
max() %>%
format("%d %B, %Y."),
"Predictions are for",
confirmed %>%
dplyr::filter(Country == "Australia") %>%
.[["date"]] %>%
max() %>%
add(1) %>%
format("%d %B, %Y."),
"Standard errors for predictions have not been included."
)
)
| State | Expected Daily Increase | Most Recent | Predicted For Next Day |
|---|---|---|---|
| New South Wales | 121.7% | 1,219 | 1,484 |
| Victoria | 123.8% | 520 | 644 |
| Queensland | 123.4% | 493 | 609 |
| South Australia | 124.3% | 235 | 292 |
| Western Australia | 125.7% | 231 | 290 |
| Australian Capital Territory | 145.5% | 53 | 77 |
| Tasmania | 119.4% | 42 | 50 |
tested <- list.files("tested", pattern = "tsv", full.names = TRUE) %>%
sort() %>%
.[length(.)] %>%
read_tsv()
nswTests <- NA_real_
nswUrl <- glue(
"https://www.health.nsw.gov.au/news/Pages/{str_remove_all(Sys.Date(), '-')}_00.aspx"
)
if (url.exists(nswUrl)){
nswTests <- nswUrl %>%
read_html() %>%
html_nodes("body") %>%
xml_find_all("//td[contains(@class, 'moh-rteTableFooterOddCol-6')]") %>%
html_text()
if (length(nswTests)) {
nswTests <- nswTests[[1]] %>%
str_remove_all(",") %>%
as.numeric()
}
}
qldTests <- NA_real_
qldUrl <- glue(
"https://www.qld.gov.au/health/conditions/health-alerts/coronavirus-covid-19/current-status/current-status-and-contact-tracing-alerts#symptoms-testing"
)
if (url.exists(qldUrl)){
qldTests <- qldUrl %>%
read_html() %>%
html_nodes("body") %>%
xml_find_all("//table[contains(@id, 'table59454')]") %>%
html_text() %>%
str_replace_all(
".+Samples testedTotal([0-9,]+)Tests.+",
"\\1"
) %>%
str_remove_all(",") %>%
as.numeric()
}
vicTests <- NA_real_
vicUrl <- glue(
"https://www.dhhs.vic.gov.au/coronavirus-update-victoria-{format(Sys.Date(), '%d-%B-%Y')}"
) %>%
str_to_lower()
if (url.exists(vicUrl)){
vicTests <- vicUrl %>%
read_html() %>%
html_nodes("body") %>%
xml_find_all("//div[contains(@class, 'field field--name-field-dhhs')]") %>%
html_text() %>%
str_split("\n") %>%
.[[1]] %>%
str_subset("tested") %>%
str_replace_all(
".+than ([0-9,]+) Victorians have been tested.+",
"\\1"
) %>%
str_remove_all(",") %>%
as.numeric()
}
waTests <- NA_real_
waUrl <- glue(
"https://ww2.health.wa.gov.au/Media-releases/2020/COVID19-update-{format(Sys.Date(), '%d-%B-%Y')}"
)
if (url.exists(waUrl)){
waTxt <- waUrl %>%
read_html() %>%
html_nodes("body") %>%
xml_find_all("//div[contains(@id, 'contentArea')]") %>%
html_text() %>%
str_split("\n") %>%
.[[1]] %>%
str_trim()
waConf <- waTxt %>%
str_subset("total to") %>%
str_replace_all(".+total to ([0-9,]+).", "\\1") %>%
as.numeric()
waNeg <- waTxt %>%
str_subset("have tested") %>%
str_replace_all(".+ ([0-9,]+) Western.+", "\\1") %>%
str_remove_all(",") %>%
as.numeric()
waTests <- waConf + waNeg
}
saTests <- NA_real_
saUrl <- glue(
"https://www.sahealth.sa.gov.au/wps/wcm/connect/public+content/sa+health+internet/about+us/news+and+media/all+media+releases/covid-19+update+{format(Sys.Date(), '%d+%B++%Y')}"
) %>%
str_to_lower()
if (url.exists(saUrl)) {
saTests <- saUrl %>%
read_html() %>%
html_nodes("body") %>%
xml_find_all("//div[contains(@class, 'wysiwyg')]") %>%
html_text() %>%
str_split("\n")
if (length(saTests)) {
saTests <- saTests[[1]] %>%
str_trim() %>%
str_subset("more than.+tests") %>%
str_replace_all(".+ ([0-9,]+) tests.", "\\1") %>%
str_remove_all(",") %>%
as.numeric()
}
}
actTests <- NA_real_
actUrl <- "https://www.health.act.gov.au/about-our-health-system/novel-coronavirus-covid-19"
if (url.exists(actUrl)){
actTests <- actUrl %>%
read_html() %>%
html_nodes("body") %>%
xml_find_all(
"//div[contains(@class, 'statuscontent')]"
) %>%
html_text() %>%
str_split("\n") %>%
.[[1]] %>%
.[. != ""] %>%
str_split_fixed(":", 2) %>%
apply(2, str_trim) %>%
.[,2] %>%
as.numeric() %>%
sum()
}
tasTests <- NA_real_
tasUrl <- glue(
"https://www.dhhs.tas.gov.au/news/2020/coronavirus_update_{format(Sys.Date(), '%d_%B_%Y')}"
) %>%
str_to_lower()
if (url.exists(tasUrl)) {
tasTests <- tasUrl %>%
read_html() %>%
html_nodes("body") %>%
xml_find_all(
"//div[contains(@id, 'content_div_117564')]"
) %>%
html_text() %>%
str_split("\n") %>%
.[[1]] %>%
str_trim() %>%
.[. != ""] %>%
str_replace_all(".+Tasmania has conducted ([0-9]+) tests.+", "\\1") %>%
as.numeric()
}
tested <- c(
"New South Wales" = nswTests,
"Queensland" = qldTests,
"Victoria" = vicTests,
"Western Australia" = waTests,
"South Australia" = saTests,
"Australian Capital Territory" = actTests,
"Tasmania" = tasTests
) %>%
enframe(
name = "State", value = "Tested"
) %>%
mutate(Tested = as.numeric(Tested)) %>%
dplyr::filter(!is.na(Tested)) %>%
bind_rows(tested) %>%
group_by(State) %>%
summarise(Tested = max(Tested, na.rm = TRUE))
tested %>%
write_tsv(
glue(
"tested/tested_{format(Sys.Date(), '%Y%m%d')}.tsv"
)
)
Testing numbers were initially sourced on 24th March, 2020 from manual inspection of individual press releases for NSW, QLD, VIC, WA, SA, TAS and the ACT. No testing figures have yet been released by the NT government.
Updates on testing numbers beyond the initial values were performed automatically using the above code which probes each state’s official releases for the latest values, and updates where found. The number of tested individuals in each state was then assessed as a function of State population size. All results are valid at the time of report generation.
tested %>%
left_join(
latestAU,
by = c("State" = "Province/State")
) %>%
rowwise() %>%
mutate(
Tested = max(Tested, tested, na.rm = TRUE)
) %>%
ungroup() %>%
dplyr::select(State, Tested) %>%
left_join(
confirmed %>%
dplyr::filter(
date == max(date)
) %>%
rename(
State = `Province/State`
)
) %>%
left_join(ausPops) %>%
mutate(
`Percent Tested` = Tested / Population,
Positive = confirmed / Tested,
Negative = 1 - Positive
) %>%
dplyr::select(
State, Population,
Confirmed = confirmed,
`Total Tested` = Tested,
contains("Percent"),
ends_with("ive")
) %>%
arrange(desc(`Percent Tested`)) %>%
mutate_at(
vars(contains("Percent"), ends_with("ive")),
percent,
accuracy = 0.01
) %>%
pander(
justify = "lrrrrrr",
caption = glue(
"*COVID-19 testing scaled by state population size.
Confirmed cases are assumed to be the tests returning a positive result.
The current numbers available for some states are a lower limit, and as such, the proportion of the population tested is likely to be higher, as is the proportion of tests returning a negative result.*"
)
)
| State | Population | Confirmed | Total Tested | Percent Tested | Positive | Negative |
|---|---|---|---|---|---|---|
| South Australia | 1,756,494 | 235 | 22,000 | 1.25% | 1.07% | 98.93% |
| New South Wales | 8,117,976 | 1,219 | 74,006 | 0.91% | 1.65% | 98.35% |
| Australian Capital Territory | 428,060 | 53 | 3,272 | 0.76% | 1.62% | 98.38% |
| Queensland | 5,115,451 | 493 | 38,860 | 0.76% | 1.27% | 98.73% |
| Western Australia | 2,630,557 | 231 | 11,519 | 0.44% | 2.01% | 97.99% |
| Victoria | 6,629,870 | 520 | 26,900 | 0.41% | 1.93% | 98.07% |
| Tasmania | 535,500 | 42 | 1,272 | 0.24% | 3.30% | 96.70% |
R version 3.6.3 (2020-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C
attached base packages: stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: plotly(v.4.9.2), pander(v.0.6.3), RCurl(v.1.98-1.1), rvest(v.0.3.5), xml2(v.1.2.2), jsonlite(v.1.6.1), glue(v.1.3.1), broom(v.0.5.4), ggrepel(v.0.8.1), matrixStats(v.0.55.0), scales(v.1.1.0), lubridate(v.1.7.4), magrittr(v.1.5), forcats(v.0.4.0), stringr(v.1.4.0), dplyr(v.0.8.4), purrr(v.0.3.3), readr(v.1.3.1), tidyr(v.1.0.2), tibble(v.2.1.3), ggplot2(v.3.2.1) and tidyverse(v.1.3.0)
loaded via a namespace (and not attached): httr(v.1.4.1), viridisLite(v.0.3.0), modelr(v.0.1.6), shiny(v.1.4.0), assertthat(v.0.2.1), highr(v.0.8), selectr(v.0.4-2), cellranger(v.1.1.0), yaml(v.2.2.1), pillar(v.1.4.3), backports(v.1.1.5), lattice(v.0.20-40), digest(v.0.6.25), promises(v.1.1.0), colorspace(v.1.4-1), htmltools(v.0.4.0), httpuv(v.1.5.2), pkgconfig(v.2.0.3), haven(v.2.2.0), xtable(v.1.8-4), later(v.1.0.0), generics(v.0.0.2), farver(v.2.0.3), ellipsis(v.0.3.0), withr(v.2.1.2), lazyeval(v.0.2.2), cli(v.2.0.1), crayon(v.1.3.4), readxl(v.1.3.1), mime(v.0.9), evaluate(v.0.14), fs(v.1.3.1), fansi(v.0.4.1), MASS(v.7.3-51.5), nlme(v.3.1-144), Cairo(v.1.5-10), tools(v.3.6.3), data.table(v.1.12.8), hms(v.0.5.3), lifecycle(v.0.1.0), munsell(v.0.5.0), reprex(v.0.3.0), compiler(v.3.6.3), rlang(v.0.4.4), grid(v.3.6.3), rstudioapi(v.0.11), htmlwidgets(v.1.5.1), crosstalk(v.1.0.0), bitops(v.1.0-6), labeling(v.0.3), rmarkdown(v.2.1), gtable(v.0.3.0), DBI(v.1.1.0), curl(v.4.3), R6(v.2.4.1), zoo(v.1.8-7), knitr(v.1.28), fastmap(v.1.0.1), stringi(v.1.4.6), Rcpp(v.1.0.3), vctrs(v.0.2.3), dbplyr(v.1.4.2), tidyselect(v.1.0.0) and xfun(v.0.12)